/**********************************************************************/
/*
   Author: Michelle Han
   Created: 5 December, 2022
   Update: Apr 2024
   Description: Cleans raw February 2021 SAKERNAS data.

   This cleaning file should output 2 different datasets:
   1. Cleaned SAKERNAS data:
   sak_feb21_deid_clean
   2. Subset of SAKERNAS person-batch data matched with PMO for SAKERNAS analysis:
   sak_feb21_deid_clean_merged
*/
/**********************************************************************/

*******************************************
* Setup

* Load the project filepaths unless a master do-file has already done so
  if "$master_run" != "1" {
    include "./Do/SET_FILEPATHS.do"
  }

  * Close any log that is still open, then start a date-stamped text log
  capture log close
  global prefix: display %tdCYND td(`c(current_date)')
  log using "$KP_logs/${prefix}_clean_SAKERNAS_feb2021.txt", text replace

  * Start from an empty session without output paging
  set more off
  clear

  * Read the raw de-identified February 2021 SAKERNAS file
  use "$KP_deid_sakernas/Raw/SAKERNAS_PRAKERJA_21FEB_deid.dta", clear

  * Give the weight variable a readable name
  rename final_weig weight

  /*----------------------------------------------------*/
                /* Section: Demographics */
  /*----------------------------------------------------*/

  * Household id: one group number per (id_nks, no_dsrt) combination.
  * The variable is created directly under its final name hh_id_s
  * (previously it was created as hh_id and renamed afterwards).
    egen hh_id_s = group(id_nks no_dsrt)

  * Observations per household id: r(N) is the number of non-missing ids and
  * r(max) is the highest group number. The ratio is displayed immediately
  * after summarize so that no other command runs between storing and
  * reading the r() results.
    summarize hh_id_s
    display `r(N)' / `r(max)'

  * Year of birth (code 9999 is recoded to missing)
    generate year_dob_sak = k5_th
    tabulate year_dob_sak
    replace year_dob_sak = . if year_dob_sak == 9999

  * Month of birth (code 99 is recoded to missing)
    generate month_dob_sak = k5_bl
    tabulate month_dob_sak
    replace month_dob_sak = . if month_dob_sak == 99

  * Age, plus 20-year age bands (1 = 0-20, 2 = 21-40, ..., 5 = 81-100)
    summarize k6
    generate age_sak = k6
    recode age_sak (0/20 = 1) (21/40 = 2) (41/60 = 3) (61/80 = 4) (81/100 = 5), generate(age_cat_sak)

  * Indicator for age <= 30 (a missing age is coded 0, because missing
  * compares as larger than any number in Stata)
    generate young = (age_sak <= 30)
    label define young 0 "Over 30 Years Old" 1 "30 and Under"
    label values young young

  * Number of HH members
  * hh_size_sak_5 is jlh_art minus jlh_art5
  * NOTE(review): presumably jlh_art5 counts members under age 5 - confirm against the codebook
    generate hh_size_sak = jlh_art
    generate hh_size_sak_5 = jlh_art - jlh_art5


  * Relationship to HH head (labelled copy of k3)
    tabulate k3, missing
    generate relation_hh_head = k3
    label define relations 1 "HH head" 2 "Spouse" 3 "Son/Daughter" 4 "Step/adopted child" 5 "Son/Daughter-in-law" 6 "Grandchild" 7 "Parent/Parent-in-law" 8 "Other family" 9 "Housemaid" 10 "Driver/Gardener" 11 "No relation", replace
    label values relation_hh_head relations
    tabulate relation_hh_head

  * Order in roster: urutan must uniquely identify observations; flag the
  * first observation (lowest urutan) within each household
    isid urutan
    bysort hh_id_s (urutan): generate roster_first = (_n == 1)
    tabulate relation_hh_head roster_first

  * Marital status indicators built from r4 (a missing r4 gives 0 on all four)
    tabulate r4, missing
    generate married  = (r4 == 2)
    generate divorced = (r4 == 3)
    generate widowed  = (r4 == 4)
    generate single   = (r4 == 1)

  * Gender: female indicator
    tabulate k4, missing
    generate female = (k4 == 2)

  * Gender (version 2): labelled indicator, 1 = male, 0 = female
    generate gender = (k4 == 1)
    label define gender 1 "male"  0 "female"
    label values gender gender

  * Current student: indicator for r5 == 2
    tabulate r5, missing
    generate current_student = (r5 == 2)

  * Education level indicators built from r6a
    tabulate r6a, missing
    generate no_elementary = (r6a == 1)
    generate elementary    = (r6a == 2)
    generate junior_high   = (r6a == 3)
    generate high_school   = inlist(r6a, 4, 5)
    generate tertiary      = inrange(r6a, 6, 8)
    summarize no_elementary elementary junior_high high_school tertiary

  * Education -> years of schooling (left missing when educ is none of 1-8)
    generate educ = r6a
    generate school_years = 3 if educ == 1       // No elementary
    replace school_years = 6  if educ == 2       // elementary
    replace school_years = 9  if educ == 3       // junior high
    replace school_years = 12 if inlist(educ, 4, 5) // high (8-11)
    replace school_years = 14 if educ == 6       // Diploma I/II/III
    replace school_years = 16 if educ == 7       // Diploma IV
    replace school_years = 18 if educ == 8       // S1/S2/S3

  * Indicator for educated (graduating high school)
  * NOTE(review): the comparison is strict (> 12), so exactly 12 years of
  * schooling is coded 0 even though value label 1 reads "12 or More"; a
  * missing school_years also evaluates as > 12 in Stata and is coded 1.
  * Confirm the intended definition before changing it - changing values
  * alters the data signature checked at export.
    generate educated = (school_years > 12)
    label define educated 0 "Less than 12 Years Schooling" 1 "12 or More Years of Schooling"
    label values educated educated

  * Training courses: ever done a training course with certificate?
    tabulate r6d, missing
    generate train_certif = (r6d == 1)

  * Indicator for r6e == 1
    tabulate r6e, missing
    generate current_training = (r6e == 1)

  * Live in the same kabupaten you were born in?
    generate born_kab = r7_kab // use kabupaten where person lived 5 years ago
    *gen born_live_same = born_kab == kode_kab
    generate born_prov = r7_prov

  * Migration: indicator for r7 == 2
    tabulate r7, missing
    generate migrated = (r7 == 2)

  * Disabilities: for each item r8a-r8f create
  *   <type>_disabled   = 1 when the answer is 2 or 3
  *   <type>_disability = copy of the raw answer
  * The loop creates the variables in the order vision, hearing, walk,
  * hand, speech, other, with _disabled before _disability for each type.
    local disab_items a b c d e f
    local disab_types vision hearing walk hand speech other
    forvalues i = 1/6 {
      local item : word `i' of `disab_items'
      local type : word `i' of `disab_types'

      tabulate r8`item', missing
      generate `type'_disabled = inlist(r8`item', 2, 3)
      generate `type'_disability = r8`item'
    }
    summarize *disabled

  * Any disability: at least one of the six indicators equals 1
    generate any_disability = (vision_disabled | hearing_disabled | walk_disabled | hand_disabled | speech_disabled | other_disabled)
    tabulate any_disability

  * Severe disability: at least one raw answer equals 3
    generate severe_disability = inlist(3, r8a, r8b, r8c, r8d, r8e, r8f)
    tabulate severe_disability


  * Urban-rural: indicator for klasifikas == 1
    tabulate klasifikas, missing
    generate urban_sak = (klasifikas == 1)

  * Java: province codes 31-36
    tabulate kode_prov
    generate java_sak = inrange(kode_prov, 31, 36)

  * City 5 yrs ago: 1 when born_kab / 10 lies in [7, 8), i.e. born_kab is
  * at least 70 and below 80. The operator was previously typed as "> ="
  * with an embedded space; it is written in the canonical form ">=" here.
  * A missing born_kab gives 0, because missing fails the "< 8" test.
  * NOTE(review): presumably kabupaten codes in the 70s denote cities - confirm against the codebook
    generate city_sak_5 = born_kab/10 >= 7 & born_kab/10 < 8

  * Java 5 yrs ago: province codes 31-36 applied to born_prov
    generate java_sak_5 = inrange(born_prov, 31, 36)


  /*----------------------------------------------------*/
                  /* Section: Employment */
  /*----------------------------------------------------*/

  * Employed, first pass: 1 when any of the following equals 1
  *   r9a  worked at least 1 hour in past week
  *   r9b  did any income-generating activity
  *   r9c  helped with work
  *   r10a temporarily not working
  * This first pass is overwritten further down from employment_status.
    tabulate r9a, missing
    tabulate r9b, missing
    tabulate r9c, missing
    tabulate r10a, missing
    generate employed = inlist(1, r9a, r9b, r9c, r10a)
    tabulate employed, missing

  * Employment status (labelled copy of r12a)
    tabulate r12a, missing
    generate employment_status = r12a
    label define status 0 "Not Working" 1 "Self-employed" 2 "Business owner with temporary/unpaid workers" ///
      3 "Business owner with paid workers" 4 "Employee" 5 "Temporary worker in agriculture" ///
      6 "Temporary worker (non-agriculture)" 7 "Family/unpaid worker", replace
    label values employment_status status
    tabulate employment_status, missing

  * Type of work: the categories including permanent job, self employed, business owner, etc (12A)
    generate self_employed      = (employment_status == 1)
    generate business_owner     = inlist(employment_status, 2, 3)
    generate perm_worker        = (employment_status == 4)
    generate temp_worker        = inlist(employment_status, 5, 6)
    generate family_unpaid      = (employment_status == 7)
    generate self_emp_bus_owner = inlist(employment_status, 1, 2, 3)

    * Temporarily not working: 1 when r10a == 1, otherwise 0 (including missing)
    generate temp_not_work = (r10a == 1)

  * Were you employed last week? Final definition: employment status 1-6
  * (status 7, family/unpaid worker, is not counted as employed)
    replace employed = inlist(employment_status, 1, 2, 3, 4, 5, 6)
    tabulate employed, missing
    tabulate employed employment_status, missing row

  * Business field, set only for the employed
    // 17-sector classification: (https://www.bps.go.id/statictable/2016/01/06/1898/klasifikasi-17-sektor-tabel-input-output-indonesia-2010.html)
    tabulate r13a_kateg, missing
    label define business_fields 1 "Agriculture, Forestry and Fisheries" 2 "Mining and excavation" 3 "Industrial Production" 4 "Electricity and Gas" ///
      5 "Water Supply, Waste Management, and Recycling" 6 "Construction" 7 "Wholesale and Retail Trade, Car and Motorcycle Repair" ///
      8 "Transportation and Warehousing" 9 "Hospitality and Restaurants" 10 "Infomation and Communication" 11 "Financial Services and Insurance" ///
      12 "Real Estate" 13 "Business Services" 14 "Government Administration, Defense and Social Security" 15 "Education" 16 "Health and Social Services" 17 "Other", replace
    generate business_field = r13a_kateg if employed == 1
    label values business_field business_fields
    tabulate business_field, missing

  * KBJI code (https://www.bps.go.id/website/fileMenu/KBJI-2014.pdf)
    // attempt to create two-digit KBJI code
    // occupation is taken from r13b_kbji2 and set only for the employed
    tabulate r13b_kbji1, missing
    tabulate r13b_kbji2, missing
    label define occupations 0 "Army and Police" 1 "Manager" 2 "Professional" 3 "Technicians and Assistant Professionals" ///
      4 "Administrative Personnel" 5 "Business Services and Sales Personnel" 6 "Skilled Workers in Agriculture, Forestry and Fisheries" ///
      7 "Production, Craft, and Related Workers" 8 "Machine Operators and Assembly Workers" 9 "Blue-collar workers", replace
    generate occupation = r13b_kbji2 if employed == 1
    label values occupation occupations
    tabulate occupation, missing


  * Wage last month
  * NOTE(review): this comment used to read "assuming this is July", which
  * does not fit a file that cleans the February 2021 round - confirm the
  * reference month of r14a1/r14a2.
    summarize r14a1 r14a2 if inlist(employment_status, 1, 4, 5, 6)
    tabulate r14a1 if r14a1 < 0
    // NOTE: there are 3 negatives. Since this includes some self-employed respondents, it's possible their costs exceed revenues. Keep for now

  * Earnings including business ownership: r14a1 + r14a2, defined only for
  * employment status 1, 4, 5 or 6
    generate earnings = r14a1 + r14a2 if inlist(employment_status, 1, 4, 5, 6)
    summarize earnings if inlist(employment_status, 1, 4, 5, 6), detail
    count if mi(earnings)

  * Individual earnings, set to zero for anyone not employed
    generate ind_earnings = cond(employed == 0, 0, earnings)
    summarize ind_earnings, detail

  * Adjusted earnings: winsorize at the 1st/99th percentiles of the positive
  * values (zeros are never raised to the 1st percentile), then rescale to
  * 1000s of rupiah. The replace commands leave the r() results of the
  * summarize in place, so both cut-offs come from the same summary.
    generate ind_adj_earnings = ind_earnings
    summarize ind_adj_earnings if ind_adj_earnings > 0, detail
    replace ind_adj_earnings = r(p99) if ind_adj_earnings > r(p99) & !mi(ind_adj_earnings)
    replace ind_adj_earnings = r(p1) if ind_adj_earnings < r(p1) & !mi(ind_adj_earnings) & ind_adj_earnings != 0
    replace ind_adj_earnings = ind_adj_earnings / 1000
    label variable ind_adj_earnings "Adjusted Earnings (in 1000s of rupiah, winsorized of 1st and 99th pct)"
    summarize ind_adj_earnings, detail

  * Strictly positive adjusted earnings (missing otherwise)
    generate ind_pos_adj_earnings = ind_adj_earnings if ind_adj_earnings > 0

  * Indicator for positive individual earnings (missing when earnings are missing)
    generate ind_earnings_pos = (ind_earnings > 0) if !mi(ind_earnings)

  * Weekly cap used below: 14 hours per day over 7 days
    local weekly_cap = 14 * 7

  * Hours worked last week (raw copy of r16a2)
    summarize r16a? if inlist(employment_status, 1, 4, 5, 6)
    summarize r16a2
    count if r16a2 > (7*14)
    histogram r16a2
    generate hours_worked_raw = r16a2
    summarize hours_worked_raw if hours_worked_raw != 0

    * Hours worked in a pre-covid week (raw copy of r16b2)
    summarize r16b2 if inlist(employment_status, 1, 4, 5, 6)
    summarize r16b2
    count if r16b2 > (7*14)
    histogram r16b2
    generate hours_worked_raw_precovid = r16b2
    summarize hours_worked_raw_precovid if hours_worked_raw_precovid != 0

  * Hours worked, truncated at 14 hours per day and set to 0 for the not employed
    summarize r16a2 if inlist(employment_status, 1, 4, 5, 6)
    generate hours_worked = r16a2
    replace hours_worked = `weekly_cap' if r16a2 > `weekly_cap' & r16a2 != .
    replace hours_worked = 0 if employed == 0
    summarize hours_worked_raw hours_worked

  * Pre-covid hours worked, truncated the same way
  generate hours_worked_precovid = hours_worked_raw_precovid
  replace hours_worked_precovid = `weekly_cap' if hours_worked_raw_precovid > `weekly_cap' & hours_worked_raw_precovid != .
  replace hours_worked_precovid = 0 if employed == 0
  summarize hours_worked_raw_precovid hours_worked_precovid

  * Hourly wage (income per day / hours per day)
  * Raw version: unadjusted earnings over 31 days, raw hours over 7 days
    generate hourly_wage_raw = (earnings / 31) / (hours_worked_raw / 7)
    summarize hourly_wage_raw, detail

  * Hourly wage, including business owners
  * Built from the winsorized earnings and the truncated hours; set to 0
  * when no hours are worked
    generate hourly_wage = cond(hours_worked == 0, 0, (ind_adj_earnings/31)/(hours_worked / 7))
    generate poswage = hourly_wage if hourly_wage > 0

  * Winsorize at the 1st/99th percentiles of the positive wages (zeros stay zero)
    generate hourly_adj_wage = hourly_wage
    summarize hourly_adj_wage if hourly_adj_wage > 0, detail
    replace hourly_adj_wage = r(p99) if hourly_adj_wage > r(p99) & !mi(hourly_adj_wage)
    replace hourly_adj_wage = r(p1) if hourly_adj_wage < r(p1) & !mi(hourly_adj_wage) & hourly_adj_wage != 0
    label variable hourly_adj_wage "Adjusted Hourly Wage (in 1000s of rupiah, winsorized of 1st and 99th pct)"
    summarize hourly_adj_wage, detail

  * Positive adjusted wages, and an indicator for having one
    generate poswage_adj = hourly_adj_wage if hourly_adj_wage > 0
    generate poswage_dummy = (hourly_adj_wage > 0) if !mi(hourly_adj_wage)

  * Hourly wage set to missing (instead of 0) when no hours are worked
    generate hourly_wage_no0 = hourly_wage if hours_worked != 0

  * Positive wages
    generate poswage_no0 = hourly_wage_no0 if hourly_wage_no0 > 0

  * Winsorize at the 1st/99th percentiles
  * NOTE(review): hourly_wage is built from ind_adj_earnings, which has
  * already been divided by 1000, yet this variable is divided by 1000 once
  * more below while its label still says "in 1000s of rupiah". Confirm the
  * intended unit before changing it - changing values alters the data
  * signature checked at export.
    generate hourly_adj_wage_no0 = hourly_wage_no0
    summarize hourly_adj_wage_no0 if hourly_adj_wage_no0 > 0, detail
    replace hourly_adj_wage_no0 = r(p99) if hourly_adj_wage_no0 > r(p99) & !mi(hourly_adj_wage_no0)
    replace hourly_adj_wage_no0 = r(p1) if hourly_adj_wage_no0 < r(p1) & !mi(hourly_adj_wage_no0)
    replace hourly_adj_wage_no0 = hourly_adj_wage_no0 / 1000
    label variable hourly_adj_wage_no0 "Adjusted Hourly Wage (no zeroes, in 1000s of rupiah, winsorized of 1st and 99th pct)"
    summarize hourly_adj_wage_no0, detail
    generate poswage_adj_no0 = hourly_adj_wage_no0 if hourly_adj_wage_no0 > 0

  * Household earnings
  * mi_ind_earnings is 1 for every member of a household in which at least
  * one member has missing individual earnings; the household total is only
  * computed for households with no missing member.
    bysort hh_id_s: egen mi_ind_earnings = max(mi(ind_earnings))
    bysort hh_id_s: egen hh_earnings = total(ind_earnings) if mi_ind_earnings == 0
    generate hh_pos_earnings = hh_earnings if hh_earnings > 0

  * Household earnings rescaled to 1000s of rupiah.
  * The label previously read "Adjusted Hourly Wage", which described a
  * different variable; it now names household earnings.
    generate hh_adj_earnings = hh_earnings // no adjustment here?
    replace hh_adj_earnings = hh_adj_earnings / 1000
    label variable hh_adj_earnings "Household Earnings (in 1000s of rupiah)"
    summarize hh_adj_earnings, detail
    generate hh_pos_adj_earnings = hh_adj_earnings if hh_adj_earnings > 0

  * Indicator for positive household earnings (missing when the total is missing)
    generate hh_earnings_pos = (hh_earnings > 0) if !mi(hh_earnings)

    // Exchange rate: 14,403.60 rp/$
    // Median monthly salary (assuming 40 hours/week: 4 weeks * 5 days * 8 hours).
    // The median is taken from an explicit summarize of the hourly wage;
    // previously r(p50) was whatever the last summarize left behind, which
    // was the household-earnings median rather than a wage.
    // hourly_wage_no0 is in 1000s of rupiah per hour (it derives from
    // ind_adj_earnings), hence the factor 1000 to display rupiah and dollars.
    // NOTE(review): hourly_wage_no0 (missing when no hours are worked) is
    // used so that zero-hour observations do not pull the median to 0 -
    // confirm this is the intended wage measure.
    summarize hourly_wage_no0, detail
    display r(p50) * 1000 * 4 * 5 * 8
    display r(p50) * 1000 * 4 * 5 * 8 / 14400

  * Imputed wage (using occupation, field, province, age, and gender)
  * Inspect the outcome and each predictor first
    summarize hourly_wage
    foreach predictor in occupation business_field kode_prov {
      tabulate `predictor'
    }
    summarize age_sak
    tabulate female

  * OLS with robust standard errors; fitted values are stored only where
  * hourly_wage itself is not missing
    local wage_rhs i.occupation i.business_field i.kode_prov i.age_sak female
    regress hourly_wage `wage_rhs', vce(robust)
    predict wage_impute if hourly_wage != .

  * Compare fitted and observed wages
    summarize wage_impute hourly_wage
    correlate wage_impute hourly_wage

  * Change in wage from before covid? Indicator for r14b == 2
    tabulate r14b, missing
    generate covid_wage_loss = (r14b == 2)

  * Work tenure: inspect start month (r15a1) and start year (r15a2)
    tabulate r15a1, missing
    tabulate r15a2, missing // are the 9999s don't knows?
    tabulate r15a2 if r15a1 == 99
    tabulate r15a1 if r15a2 == 9999

    * Start year (0 and 9999 become missing)
    generate year_tenure = r15a2
    replace year_tenure = . if inlist(year_tenure, 0, 9999)
    tabulate year_tenure if employed == 1, missing

    * Start month (0 becomes missing)
    generate month_tenure = r15a1
    replace month_tenure = . if month_tenure == 0
    tabulate month_tenure if employed == 1, missing

    // month code 99: use the middle of the year when the year is known,
    // otherwise set the month to missing as well
    replace month_tenure = 6 if month_tenure == 99 & year_tenure != .
    replace month_tenure = . if month_tenure == 99 & year_tenure == .
    tabulate month_tenure if employed == 1, missing

    * Start date as a Stata monthly date
    generate start_date = ym(year_tenure, month_tenure)
    format start_date %tm
    summarize start_date, format
    count if start_date == . & employed == 1

    * Tenure in months, measured back from 2020m8
    * NOTE(review): the reference month is August 2020 although this file
    * cleans the February 2021 round, so jobs started after 2020m8 get a
    * negative tenure. Confirm the intended reference month before changing
    * it - changing values alters the data signature checked at export.
    summarize start_date, format
    count if start_date < tm(2020m8)
    generate tenure_months_raw = tm(2020m8) - start_date
    summarize tenure_months_raw, detail
    display `r(max)'/12
    count if tenure_months_raw == . & employed == 1
    histogram tenure_months_raw

    * Plausibility check against age
    // age in months minus 48 approximates the number of months a respondent
    // has been at least 4 years old
    generate age_months = 12 * age_sak - 48
    summarize age_months
    // report (without stopping) respondents whose job would predate age 4
    capture noisily assert tenure_months_raw < age_months if tenure_months_raw != .

    * Cleaned tenure: the raw value, set to missing when it is not below age_months
    generate tenure_months = tenure_months_raw
    replace tenure_months = . if tenure_months >= age_months
    drop year_tenure month_tenure age_months
    summarize tenure_months*
    tabulate tenure_months if employed == 1, missing

  * Use internet at work
    tabulate r17a, missing
    // missing when r17a is 0; then 0 for the not employed who are still missing
    generate use_internet_work = (r17a == 1) if r17a != 0
    replace use_internet_work = 0 if employed == 0 & mi(use_internet_work)
    tabulate use_internet_work

  * Had more than one job
    tabulate r20a, missing
    generate multiple_jobs = (r20a == 1)
    tabulate multiple_jobs, missing

  * Job search / business creation
    tabulate r22a, missing
    generate looking_job = (r22a == 1)

    tabulate r22b, missing
    generate prep_new_bus = (r22b == 1)

    * 1 when any of r24a-r24f equals 1
    generate activities_job = inlist(1, r24a, r24b, r24c, r24d, r24e, r24f)
    label variable activities_job "Any effort to find a job"

    * One indicator per search activity, created in the order r24a ... r24f
    * Note: 1 = yes, 2 = no, 0 = not asked
    local search_items a b c d e f
    local search_names reg_job_mkt contact_company advert_online search_network cap_loc_lic any_attempt
    forvalues j = 1/6 {
      local item : word `j' of `search_items'
      local name : word `j' of `search_names'
      generate `name' = (r24`item' == 1)
    }

    tabulate r25a, missing

    * Labor force: employed, looking for a job, or preparing a new business
    tabulate employed, missing
    generate labor_force = inlist(1, employed, looking_job, prep_new_bus)
    tabulate labor_force, missing

  * Anyone in HH own business with employees? (employment status 2 or 3)
    generate with_emp = inlist(employment_status, 2, 3)

  *bys hh_id: egen with_emp_HH = max(with_emp)
    bysort hh_id_s: egen with_emp_HH_S = max(with_emp)

  * Household-level flags: any member actively starting a new business (22B),
  * and any member self-employed
    bysort hh_id_s: egen prep_bus_hh = max(prep_new_bus)
    bysort hh_id_s: egen hh_emp_self = max(self_employed)

  /*----------------------------------------------------*/
              /* Section: Kartu Prakerja */
  /*----------------------------------------------------*/

    * Do you know of kartu prakerja?
    tabulate r34a, missing
    generate prakerja_familiar = (r34a == 1)

    * Did you sign up for kartu prakerja?
    tabulate r34b, missing
    generate report_applied = (r34b == 1)
    tabulate report_applied

    * Main reason signed up for kartu prakerja (missing when r34c is 0)
    tabulate r34c, missing
    generate reason_prakerja = r34c if r34c != 0
    label define reasons 1 "Skills" 2 "Incentives" 3 "Fill spare time" 4 "Join friends/try repeatedly"  5 "Free registration" 6 "Other", replace
    label values reason_prakerja reasons
    tabulate reason_prakerja

    * Selected for kartu prakerja
    tabulate r34d, missing
    generate report_selected = (r34d == 1)
    tabulate report_applied report_selected

    * Completed kartu prakerja training
    tabulate r34e, missing
    generate complete_pk_train = (r34e == 1)
    tabulate complete_pk_train, missing

    * Kartu prakerja improved skills
    tabulate r34f, missing
    generate pk_improve_skill = (r34f == 1)
    tabulate pk_improve_skill, missing

    * Did you receive kartu prakerja incentives?
    tabulate r34g, missing
    generate receive_pk_aid = (r34g == 1)

    * What are KP benefits spent on? One indicator per item r34h1-r34h5,
    * each missing when the raw answer is 0
    local aid_purposes needs business debt save other
    forvalues k = 1/5 {
      local purpose : word `k' of `aid_purposes'
      tabulate r34h`k', missing
      generate use_pk_aid_`purpose' = (r34h`k' == 1) if r34h`k' != 0
    }
    summarize use_pk_aid*

  * New job since the date you registered for Prakerja (since batch 1 will be fine) (we know date you started your job in 15a -- note this is only for main job)
  * batch 1 starts 11 April 2020
  * 1 when the job started in 2020m4 or later; forced to 0 for business
  * owners and for the not employed
    generate new_job = (start_date >= tm(2020m4)) if !mi(start_date)
    replace new_job = 0 if business_owner == 1 | employed == 0
    tabulate new_job, missing

  * New business since start of batch: same date rule, forced to 0 for
  * anyone who is not a business owner or is not employed
    generate new_start = (start_date >= tm(2020m4)) if !mi(start_date)
    replace new_start = 0 if business_owner != 1 | employed == 0
    tabulate new_start, missing

  ******************************************************
* Check duplicates: anon_id4_dupe counts the other observations that share
* the same anon_id4 (0 = unique)
	duplicates tag anon_id4, generate(anon_id4_dupe)
	tabulate anon_id4_dupe

* Drop duplicates: every observation whose non-missing anon_id4 occurs more
* than once is removed (no copy is kept); observations with a missing
* anon_id4 are left in place
	drop if anon_id4_dupe > 0 & !missing(anon_id4)
	drop anon_id4_dupe

/*----------------------------------------------------*/
              * Export cleaned data
/*----------------------------------------------------*/

  * Reproducibility guard: the cleaned data are saved only when their data
  * signature equals the recorded value. The signature depends on the
  * variables' names, order, storage types and values, so any change to the
  * cleaning steps above requires updating the recorded string.
  datasignature
  if "`r(datasignature)'" == "203388:267(71654):1580143782:2642132846" {
    save "$KP_deid_sakernas/Clean/sak_feb21_deid_clean.dta", replace
  }
  else {
    di as err "Careful, your machine produces a different dataset"
    * Abort with an explicit return code. The script previously halted by
    * calling "stop", which is not a Stata command and therefore printed an
    * unrelated "unrecognized command" error on top of the message above.
    exit 459
  }
  

/*----------------------------------------------------*/
            * Merge with admin data and export
/*----------------------------------------------------*/

  * Keep respondents that carry at least one of the two identifiers
  keep if !mi(userid_bps) | !mi(anon_id4)

  * Drop the raw survey variables now that the cleaned versions exist
  drop r? r?? r??? r???? k? klasifikas jlh_art* k5_* r7_* r13* r19* r29* r16* r28* r30* date_incentive

  * Survey round identifier
  gen sak_round = 6


* merge
  * Only observations with a non-missing anon_id4 can be matched. The data in
  * memory are replaced by the admin file immediately afterwards, so the
  * subset is saved to a temporary file without a preserve/restore pair.
	keep if !missing(anon_id4)
	tempfile sak_anon_id
	save `sak_anon_id'

  * Admin data can hold several rows per anon_id4 (m:1); keep matched rows only
  use "$KP_deid_admin/Clean/pmo_b1-22_clean_long_deid.dta", clear

  merge m:1 anon_id4 using `sak_anon_id', nogenerate keep(match)


  * Reproducibility guard: save only when the data signature equals the
  * recorded value
	compress
  datasignature
  if "`r(datasignature)'" == "52073:206(72091):981123156:2111611440" {
    save "$KP_deid_sakernas/Clean/sak_feb21_deid_clean_merged.dta", replace
  }
  else {
    di as err "Careful, your machine produces a different dataset"
    * Abort with an explicit return code. The script previously halted by
    * calling "stop", which is not a Stata command and therefore printed an
    * unrelated "unrecognized command" error on top of the message above.
    exit 459
  }

  * Close the log opened in the setup section when this file is run on its
  * own; under a master run the log is left open, as before.
  if "$master_run" != "1" capture log close

  // done
